In [1]:
# https://github.com/Avik-Jain/100-Days-Of-ML-Code/blob/master/Code/Day2_Simple_Linear_Regression.md

In [13]:
# Step 1: Data Preprocessing
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

dataset = pd.read_csv('studentscores.csv')
X = dataset.iloc[:, :1].values  # first column as the feature matrix (2-D)
Y = dataset.iloc[:, 1].values   # second column as the target vector (1-D)

from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=1/4, random_state=0)

In [14]:
print(X_train, '\n\n', Y_train)


[[7.8]
 [6.9]
 [1.1]
 [5.1]
 [7.7]
 [3.3]
 [8.3]
 [9.2]
 [6.1]
 [3.5]
 [2.7]
 [5.5]
 [2.7]
 [8.5]
 [2.5]
 [4.8]
 [8.9]
 [4.5]] 

 [86 76 17 47 85 42 81 88 67 30 25 60 30 75 21 54 95 41]

In [15]:
print(X_test, '\n\n', Y_test)


[[1.5]
 [3.2]
 [7.4]
 [2.5]
 [5.9]
 [3.8]
 [1.9]] 

 [20 27 69 30 62 35 24]

In [16]:
plt.scatter(X_train, Y_train, color='red')


Out[16]:
<matplotlib.collections.PathCollection at 0x7fa3778f8748>

In [17]:
# Step 2: Fitting Simple Linear Regression Model to the training set

from sklearn.linear_model import LinearRegression  # Ordinary least squares Linear Regression

regressor = LinearRegression()
regressor = regressor.fit(X_train, Y_train)  # Fit linear model

In [19]:
regressor.score(X_train, Y_train)  # Returns the coefficient of determination R^2 of the prediction.


Out[19]:
0.9484509249326872

The best possible score is 1.0, and it can be negative (because the model can be arbitrarily worse). A constant model that always predicts the expected value of y, disregarding the input features, would get an R^2 score of 0.0.
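
As a sanity check, we can reproduce this score directly from its definition, R^2 = 1 - SS_res/SS_tot. A minimal sketch using the arrays already defined above:


In [ ]:
# Reproduce regressor.score by hand: R^2 = 1 - SS_res / SS_tot
y_hat = regressor.predict(X_train)                 # predictions on the training set
ss_res = np.sum((Y_train - y_hat) ** 2)            # residual sum of squares
ss_tot = np.sum((Y_train - Y_train.mean()) ** 2)   # total sum of squares around the mean
print(1 - ss_res / ss_tot)                         # should match regressor.score(X_train, Y_train)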


In [21]:
# Estimated coefficients for the linear regression problem. 
# If multiple targets are passed during the fit (y 2D), this is a 2D array of shape (n_targets, n_features), 
# while if only one target is passed, this is a 1D array of length n_features.
regressor.coef_


Out[21]:
array([9.94167834])

In [24]:
regressor.intercept_  # Independent term in the linear model.


Out[24]:
1.9322042531516601
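
Together, coef_ and intercept_ define the fitted line y = coef_ * x + intercept_, and predict() simply evaluates it. A quick sketch to confirm this (the value 5.0 is an arbitrary example input):


In [ ]:
# predict() just evaluates the fitted line y = coef_ * x + intercept_
x = 5.0
print(regressor.coef_[0] * x + regressor.intercept_)   # manual evaluation of the line
print(regressor.predict(np.array([[x]]))[0])           # the two values should agree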

In [22]:
# Step 3: Predicting the Result
Y_pred = regressor.predict(X_test)

In [25]:
print(Y_pred)


[16.84472176 33.74557494 75.50062397 26.7864001  60.58810646 39.71058194
 20.8213931 ]

In [26]:
# Step 4: Visualization
# Visualizing the training results

plt.scatter(X_train, Y_train, color='red')
plt.plot(X_train, regressor.predict(X_train), color='blue')


Out[26]:
[<matplotlib.lines.Line2D at 0x7fa377506cc0>]

In [27]:
# Visualizing the test results

plt.scatter(X_test, Y_test, color='red')
plt.plot(X_test, regressor.predict(X_test), color='blue')


Out[27]:
[<matplotlib.lines.Line2D at 0x7fa3774c1400>]

In [42]:
X_test


Out[42]:
array([[1.5],
       [3.2],
       [7.4],
       [2.5],
       [5.9],
       [3.8],
       [1.9]])

In [43]:
Y_test


Out[43]:
array([20, 27, 69, 30, 62, 35, 24])

In [12]:
regressor.score(X_train, Y_train)


Out[12]:
0.9484509249326872

In [28]:
regressor.predict(X_train)


Out[28]:
array([79.4772953 , 70.5297848 , 12.86805043, 52.63476378, 78.48312747,
       34.73974277, 84.44813447, 93.39564498, 62.57644212, 36.72807844,
       28.77473577, 56.61143512, 28.77473577, 86.43647014, 26.7864001 ,
       49.65226028, 90.41314147, 46.66975678])

In [40]:
regressor.predict(np.array([[2]]))  # the value inside the double brackets is the x value on the line; the return is the predicted y


Out[40]:
array([21.81556093])

In [41]:
regressor.predict(np.array([[5]]))


Out[41]:
array([51.64059595])

In [45]:
regressor.predict(np.array([[1.5]]))  # a value on the fitted line, not one of the training or test points


Out[45]:
array([16.84472176])

Another study


In [46]:
# https://www.kdnuggets.com/2019/03/beginners-guide-linear-regression-python-scikit-learn.html

In [47]:
import pandas as pd  
import numpy as np  
import matplotlib.pyplot as plt  
import seaborn as seabornInstance 
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LinearRegression
from sklearn import metrics
%matplotlib inline

In [48]:
dataset = pd.read_csv('Weather.csv')


/home/miky/.local/lib/python3.5/site-packages/IPython/core/interactiveshell.py:2785: DtypeWarning: Columns (7,8,18,25) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)

In [49]:
dataset.head()


Out[49]:
STA Date Precip WindGustSpd MaxTemp MinTemp MeanTemp Snowfall PoorWeather YR ... FB FTI ITH PGT TSHDSBRSGF SD3 RHX RHN RVG WTE
0 10001 1942-7-1 1.016 NaN 25.555556 22.222222 23.888889 0 NaN 42 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
1 10001 1942-7-2 0 NaN 28.888889 21.666667 25.555556 0 NaN 42 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
2 10001 1942-7-3 2.54 NaN 26.111111 22.222222 24.444444 0 NaN 42 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
3 10001 1942-7-4 2.54 NaN 26.666667 22.222222 24.444444 0 NaN 42 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
4 10001 1942-7-5 0 NaN 26.666667 21.666667 24.444444 0 NaN 42 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN

5 rows × 31 columns


In [50]:
dataset.shape


Out[50]:
(119040, 31)

In [51]:
dataset.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119040 entries, 0 to 119039
Data columns (total 31 columns):
STA            119040 non-null int64
Date           119040 non-null object
Precip         119040 non-null object
WindGustSpd    532 non-null float64
MaxTemp        119040 non-null float64
MinTemp        119040 non-null float64
MeanTemp       119040 non-null float64
Snowfall       117877 non-null object
PoorWeather    34237 non-null object
YR             119040 non-null int64
MO             119040 non-null int64
DA             119040 non-null int64
PRCP           117108 non-null object
DR             533 non-null float64
SPD            532 non-null float64
MAX            118566 non-null float64
MIN            118572 non-null float64
MEA            118542 non-null float64
SNF            117877 non-null object
SND            5563 non-null float64
FT             0 non-null float64
FB             0 non-null float64
FTI            0 non-null float64
ITH            0 non-null float64
PGT            525 non-null float64
TSHDSBRSGF     34237 non-null object
SD3            0 non-null float64
RHX            0 non-null float64
RHN            0 non-null float64
RVG            0 non-null float64
WTE            0 non-null float64
dtypes: float64(20), int64(4), object(7)
memory usage: 28.2+ MB

In [52]:
dataset.describe()


Out[52]:
STA WindGustSpd MaxTemp MinTemp MeanTemp YR MO DA DR SPD ... FT FB FTI ITH PGT SD3 RHX RHN RVG WTE
count 119040.000000 532.000000 119040.000000 119040.000000 119040.000000 119040.000000 119040.000000 119040.000000 533.000000 532.000000 ... 0.0 0.0 0.0 0.0 525.000000 0.0 0.0 0.0 0.0 0.0
mean 29659.435795 37.774534 27.045111 17.789511 22.411631 43.805284 6.726016 15.797530 26.998124 20.396617 ... NaN NaN NaN NaN 12.085333 NaN NaN NaN NaN NaN
std 20953.209402 10.297808 8.717817 8.334572 8.297982 1.136718 3.425561 8.794541 15.221732 5.560371 ... NaN NaN NaN NaN 5.731328 NaN NaN NaN NaN NaN
min 10001.000000 18.520000 -33.333333 -38.333333 -35.555556 40.000000 1.000000 1.000000 2.000000 10.000000 ... NaN NaN NaN NaN 0.000000 NaN NaN NaN NaN NaN
25% 11801.000000 29.632000 25.555556 15.000000 20.555556 43.000000 4.000000 8.000000 11.000000 16.000000 ... NaN NaN NaN NaN 8.500000 NaN NaN NaN NaN NaN
50% 22508.000000 37.040000 29.444444 21.111111 25.555556 44.000000 7.000000 16.000000 32.000000 20.000000 ... NaN NaN NaN NaN 11.600000 NaN NaN NaN NaN NaN
75% 33501.000000 43.059000 31.666667 23.333333 27.222222 45.000000 10.000000 23.000000 34.000000 23.250000 ... NaN NaN NaN NaN 15.000000 NaN NaN NaN NaN NaN
max 82506.000000 75.932000 50.000000 34.444444 40.000000 45.000000 12.000000 31.000000 78.000000 41.000000 ... NaN NaN NaN NaN 23.900000 NaN NaN NaN NaN NaN

8 rows × 24 columns

Finally, let's plot our data points on a 2-D graph to eyeball the dataset and see whether we can spot any relationship between the two variables, using the script below:


In [53]:
dataset.plot(x='MinTemp', y='MaxTemp', style='o')  
plt.title('MinTemp vs MaxTemp')  
plt.xlabel('MinTemp')  
plt.ylabel('MaxTemp')  
plt.show()


Let's check the distribution of the maximum temperature. Once we plot it, we can observe that most maximum temperatures fall between roughly 25 and 35.


In [54]:
plt.figure(figsize=(15,10))
plt.tight_layout()
seabornInstance.distplot(dataset['MaxTemp'])


Out[54]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fa3761e9a90>

Our next step is to divide the data into "attributes" and "labels".

Attributes are the independent variables, while labels are the dependent variables whose values are to be predicted. Although the dataset has many columns, we use only two of them: we want to predict MaxTemp based on the recorded MinTemp. Therefore our attribute set consists of the "MinTemp" column, stored in the X variable, and the label is the "MaxTemp" column, stored in the y variable.


In [55]:
X = dataset['MinTemp'].values.reshape(-1, 1)  # reshape to a 2-D column vector, as scikit-learn expects
y = dataset['MaxTemp'].values.reshape(-1, 1)

Next, we assign 80% of the data to the training set and 20% to the test set, using the code below.

The test_size argument specifies the proportion of the data held out for the test set.


In [56]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

After splitting the data into training and testing sets, it is finally time to train our algorithm. For that, we import the LinearRegression class, instantiate it, and call its fit() method with our training data.


In [57]:
regressor = LinearRegression()  
regressor.fit(X_train, y_train)  # training the algorithm


Out[57]:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

As discussed, the linear regression model finds the best values for the intercept and slope, which results in the line that best fits the data. To see the intercept and slope calculated by the algorithm for our dataset, execute the following code.


In [58]:
#To retrieve the intercept:
print(regressor.intercept_)

#For retrieving the slope:
print(regressor.coef_)


[10.66185201]
[[0.92033997]]

The result should be approximately 10.66185201 and 0.92033997 respectively.

This means that for every one-unit change in the minimum temperature, the maximum temperature changes by about 0.92 units.
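
We can verify this interpretation directly: increasing the input by one unit shifts the prediction by exactly the slope. A quick sketch (20.0 is an arbitrary example value):


In [ ]:
# A one-unit increase in MinTemp changes the prediction by exactly the slope
p20 = regressor.predict(np.array([[20.0]]))
p21 = regressor.predict(np.array([[21.0]]))
print(p21 - p20)   # ~0.92, i.e. regressor.coef_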

Now that we have trained our algorithm, it's time to make some predictions. To do so, we will use our test data and see how accurately our algorithm predicts the maximum temperature. To make predictions on the test data, execute the following script:


In [59]:
y_pred = regressor.predict(X_test)

Now, to compare the actual output values for X_test with the predicted values, execute the following script:


In [60]:
df = pd.DataFrame({'Actual': y_test.flatten(), 'Predicted': y_pred.flatten()})
df


Out[60]:
Actual Predicted
0 28.888889 33.670351
1 31.111111 30.091251
2 27.222222 26.512151
3 28.888889 31.113851
4 23.333333 15.774852
5 37.222222 30.602551
6 22.222222 11.684452
7 35.555556 33.670351
8 30.555556 30.602551
9 28.888889 32.647751
10 24.444444 29.068651
11 22.777778 23.955652
12 30.555556 30.091251
13 26.111111 26.000851
14 27.222222 29.068651
15 30.555556 32.647751
16 10.555556 15.774852
17 32.222222 32.136451
18 29.444444 29.579951
19 23.333333 18.842652
20 30.555556 26.000851
21 43.333333 31.113851
22 27.222222 30.091251
23 33.333333 33.159051
24 31.111111 22.421752
25 30.000000 31.113851
26 33.333333 30.602551
27 31.111111 30.602551
28 33.333333 35.204251
29 36.111111 30.091251
... ... ...
23778 4.444444 9.639252
23779 31.111111 32.136451
23780 30.000000 32.136451
23781 34.444444 30.091251
23782 28.888889 32.647751
23783 32.222222 29.068651
23784 25.000000 15.774852
23785 26.666667 30.091251
23786 35.000000 32.136451
23787 15.000000 12.195752
23788 19.444444 22.421752
23789 15.000000 19.865252
23790 33.333333 25.489551
23791 27.777778 26.000851
23792 31.666667 30.091251
23793 31.666667 31.625151
23794 29.444444 31.113851
23795 29.444444 33.670351
23796 16.666667 17.820052
23797 20.000000 21.399152
23798 28.888889 32.136451
23799 31.111111 30.091251
23800 44.444444 35.715551
23801 36.666667 36.226851
23802 29.444444 33.159051
23803 32.777778 32.136451
23804 32.222222 29.068651
23805 31.111111 32.647751
23806 31.111111 30.602551
23807 36.666667 31.625151

23808 rows × 2 columns

We can also visualize the comparison as a bar graph using the script below:

Note: as the number of records is huge, I'm plotting just the first 25 records for readability.


In [61]:
df1 = df.head(25)
df1.plot(kind='bar',figsize=(16,10))
plt.grid(which='major', linestyle='-', linewidth='0.5', color='green')
plt.grid(which='minor', linestyle=':', linewidth='0.5', color='black')
plt.show()


Though our model is not very precise, the predicted values are reasonably close to the actual ones.

Let's plot our fitted line against the test data:


In [62]:
plt.scatter(X_test, y_test,  color='gray')
plt.plot(X_test, y_pred, color='red', linewidth=2)
plt.show()



In [63]:
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))  
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))  
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))


Mean Absolute Error: 3.1993291783785445
Mean Squared Error: 17.631568097568472
Root Mean Squared Error: 4.198996082109208
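
For reference, these metrics can also be computed directly from their definitions with NumPy. A minimal sketch on the arrays already in scope:


In [ ]:
# Compute the same metrics by hand to see what they measure
errors = y_test - y_pred                        # residuals on the test set
print('MAE: ', np.mean(np.abs(errors)))         # mean absolute error
print('MSE: ', np.mean(errors ** 2))            # mean squared error
print('RMSE:', np.sqrt(np.mean(errors ** 2)))   # root mean squared error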

Another study


In [64]:
# https://www.kdnuggets.com/2019/03/beginners-guide-linear-regression-python-scikit-learn.html/2

In [65]:
import pandas as pd  
import numpy as np  
import matplotlib.pyplot as plt  
import seaborn as seabornInstance 
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LinearRegression
from sklearn import metrics
%matplotlib inline
